{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 首先 import 必要的模块\n",
    "import pandas as pd #读数据，对数据进行分析\n",
    "import numpy as np #存数组和矩阵\n",
    "\n",
    "import matplotlib.pyplot as plt #可视化\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>企业类别标签</th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   企业类别标签                                            content\n",
       "0       2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1       2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2       1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3       2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4       2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取数据\n",
    "train = pd.read_csv('./training.csv',header=None,names=['企业类别标签','content'],encoding=\"utf-8\") \n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4774 entries, 0 to 4773\n",
      "Data columns (total 2 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   企业类别标签   4774 non-null   int64 \n",
      " 1   content  4774 non-null   object\n",
      "dtypes: int64(1), object(1)\n",
      "memory usage: 74.7+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "企业类别标签     0\n",
      "content    0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(train.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.975 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "import jieba \n",
    "\n",
    "content = train.content.values.tolist()\n",
    "all_list= ['  '.join(jieba.cut(line,cut_all = False)) for line in content]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#从文件导入停用词表\n",
    "stpwrdpath =\"./stopwords.txt\"\n",
    "with open(stpwrdpath, 'rb') as fp:\n",
    "#    stopword = fp.read().decode('utf-8')  # 提用词提取\n",
    "    stopword = fp.read()\n",
    "#将停用词表转换为list  \n",
    "stpwrdlst = stopword.splitlines()\n",
    "\n",
    "# 从sklearn.feature_extraction.text里导入CountVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer \n",
    "\n",
    "vectorizer = CountVectorizer(stop_words=stpwrdlst)#将文本中的词语转换为词频矩阵 矩阵元素a[i][j] 表示j词在i类文本下的词频 \n",
    "transformer = TfidfTransformer() #该类会统计每个词语的tf-idf权值 \n",
    "tfidf = transformer.fit_transform(vectorizer.fit_transform(all_list))\n",
    "\n",
    "weight = tfidf.toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Kmeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn.preprocessing import normalize  \n",
    "from sklearn.cluster import KMeans #用minibatchkmeans不需要一下子把数据都读进来\n",
    "from sklearn import metrics #metrics里有对聚类算法进行评价的指标\n",
    "\n",
    "\n",
    "# 一个参数点（聚类数据为K）的模型\n",
    "def K_cluster_analysis(K, X):\n",
    "    print(\"K-means begin with clusters: {}\".format(K));\n",
    "    \n",
    "    #K-means,在训练集上训练\n",
    "    mb_kmeans = KMeans(n_clusters = K)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "    \n",
    "    # K值的评估标准\n",
    "    \n",
    "    #采用无参考默的评价指标：Calinski-Harabasz Index，值越大则聚类效果越好\n",
    "    CH_score = metrics.calinski_harabaz_score(X, y_pred) #(类间距离除以类内距离，所以越大越好)\n",
    "    \n",
    "    #轮廓系数Silhouette Coefficient在大样本时计算太慢\n",
    "    #si_score = metrics.silhouette_score(X, y_pred)\n",
    "    \n",
    "    print(\"CH_score: {}\".format(CH_score))\n",
    "    \n",
    "    return CH_score#,si_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters: 5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 32.445532294620804\n",
      "K-means begin with clusters: 10\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 23.897279560921998\n",
      "K-means begin with clusters: 15\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 18.699239306169243\n",
      "K-means begin with clusters: 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 15.81585433910956\n",
      "K-means begin with clusters: 30\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 12.838826151530071\n",
      "K-means begin with clusters: 40\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 10.716610716502235\n",
      "K-means begin with clusters: 50\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 9.741172972128503\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数（聚类数目K）搜索范围\n",
    "Ks = [5,10,15,20,30,40,50]\n",
    "CH_scores = []\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, weight)\n",
    "    CH_scores.append(ch)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAZcElEQVR4nO3de3RV5ZnH8e+DEJGLihApQiRqKYIWQ5tSOlTrhTAuxnqZqQ7UKm1VXGM7hRYcL1M76owdRBCwtla8tNRRi7ZYvLAqlEvRqowBEVDsoAMqd1RcFstF8Jk/3pNJggk5CeecN/vs32ets5Kzc7LO414rP7bP2c/7mrsjIiLJ0yZ2ASIi0jIKcBGRhFKAi4gklAJcRCShFOAiIgnVtpBv1q1bNy8vLy/kW4qIJN7SpUvfcffS/Y8XNMDLy8uprq4u5FuKiCSemb3Z0HG1UEREEkoBLiKSUApwEZGEUoCLiCSUAlxEJKEU4CIiCaUAFxFJqEQE+Pz5MGFC7CpERFqXRAT4738PP/whrFsXuxIRkdYjEQE+ZgyYwbRpsSsREWk9EhHgvXrByJFwzz2wfXvsakREWodEBDjAuHHw4Ydw992xKxERaR0SE+CnnALDhsEdd8Du3bGrERGJLzEBDjB+PGzaBA89FLsSEZH4EhXgQ4eGK/FJk8A9djUiInElKsDNwlX4q6+GWwtFRNIsUQEO8I//CD17wm23xa5ERCSuxAV4u3YwdiwsXAhLl8auRkQknsQFOMDo0XD44TB5cuxKRETiSWSAH354CPFHHoE3G9wpTkSk+CUywAG+973woebUqbErERGJI7EBXlam8XoRSbfEBjhovF5E0i3RAX7KKVBVpfF6EUmnRAc4wNVXh/H6hx+OXYmISGElPsCHDoUBAzReLyLp02SAm1l7M/tvM3vZzF4xs5syx48zsyVmtsbMZppZSf7Lbai+MF7/yisarxeRdMnmCnw3cKa7nwJUAGeb2WDgVmCKu/cBtgOX5a/MAxsxIozXT5oUqwIRkcJrMsA92JF52i7zcOBM4DeZ4zOA8/NSYRZqxusXLIBly2JVISJSWFn1wM3sEDNbDmwF5gFvAO+7+97MS9YDPfNTYnauuAI6d9ZVuIikR1YB7u773L0C6AUMAvo19LKGftfMRptZtZlVb9u2reWVNuGII+DKKzVeLyLp0ay7UNz9fWARMBg40szaZn7UC9jYyO9Md/dKd68sLS09mFqbpPF6EUmTbO5CKTWzIzPfHwYMBVYDC4GvZV42CpidryKzVVYWPtDUeL2IpEE2V+A9gIVmtgJ4EZjn7k8C1wA/MLPXga7AffkrM3vjx4fx+unTY1ciIpJf5gWcfqmsrPTq6uq8v8+wYbBqFaxdC4cemve3ExHJKzNb6u6V+x9P/CRmQ2p2r9d4vYgUs6IM8KoqjdeLSPErygDXeL2IpEFRBjjU7l6vwR4RKVZFG+AlJRqvF5HiVrQBDhqvF5HiVtQBfsQR2r1eRIpXUQc4wJgx4UPNadNiVyIikltFH+B1x+vffz92NSIiuVP0AQ7hlsIdO7R7vYgUl1QEeM3u9dOmwZ49sasREcmNVAQ41I7XP/RQ7EpERHIjNQGu8XoRKTapCfC64/VPPx27GhGRg5eaAIfa8frbbotdiYjIwUtVgJeUhPvCNV4vIsUgVQEOYTKzc2eYPDl2JSIiByd1AV4zXj9zpsbrRSTZUhfgoPF6ESkOqQxwjdeLSDFIZYADjBun8XoRSbbUBnhFBQwdqvF6EUmu1AY4wNVXa/d6EUmuVAd4VRV89rMarxeRZEp1gNeM169apfF6EUmeVAc4hLtRtHu9iCRR6gO8Zrx+/nyN14tIsqQ+wEHj9SKSTApw6o/Xv/VW7GpERLKjAM+oGa+fOjV2JSIi2VGAZ5SVhfXCNV4vIkmhAK+jZvf66dNjVyIi0jQFeB0arxeRJFGA72f8eNi4UeP1ItL6KcD3M2yYxutFJBmaDHAzKzOzhWa22sxeMbMxmeM3mtkGM1ueeQzPf7n5p/F6EUmKbK7A9wLj3L0fMBj4jpn1z/xsirtXZB5z8lZlgWm8XkSSoMkAd/dN7r4s8/1fgNVAz3wXFlPd8fqXXopdjYhIw5rVAzezcmAgsCRz6LtmtsLM7jezLo38zmgzqzaz6m3bth1UsYVUM16vq3ARaa2yDnAz6wT8Fhjr7h8AdwEnABXAJqDBlUTcfbq7V7p7ZWlpaQ5KLowjjoArrtB4vYi0XlkFuJm1I4T3g+4+C8Ddt7j7Pnf/GLgHGJS/MuMYO1a714tI65XNXSgG3Aesdvfb6xzvUedlFwCrcl9eXDXj9dOna7xeRFqfbK7AhwCXAGfud8vgRDNbaWYrgDOA7+ez0Fg0Xi8irZV5AadVKisrvbq6umDvlytVVfDqq7B2bbhDRUSkkMxsqbtX7n9ck5hZ0Hi9iLRGCvAsaLxeRFojBXgW6o7Xz50buxoRkUABnqWa8fobboCPPopdjYiIAjxrJSUwZQq8+CJcf33sakREFODNcuGFcNVVoRf+5JOxqxGRtFOAN9PkyWHnnlGj4O23Y1cjImmmAG+m9u3hkUfClmsjRqgfLiLxKMBboE+fsHv9c8+FDzVFRGJQgLfQiBFw5ZVw660wp2i2shCRJFGAH4QpU2DAALj0Uli/PnY1IpI2CvCDcNhhoR++axeMHAl798auSETSRAF+kPr2hbvvhmefhR/9KHY1IpImCvAcuPhiuPxy+M//1E72IlI4CvAcmTYNTj4ZvvEN2LAhdjUikgYK8Bzp0AEefRR27oSvf139cBHJPwV4Dp14Itx1FyxeDDfdFLsaESl2CvAcu+QS+Na34JZb4A9/iF2NiBQzBXge3Hkn9O8fPtzctCl2NSJSrBTgedChQ7g/fMeOEOL79sWuSESKkQI8T/r3h5/+FBYuhH//99jViEgxUoDn0Te/Gcbsb74ZFiyIXY2IFBsFeJ797Gfh7pSvfx02b45djYgUEwV4nnXsGPrhH3wQhnzUDxeRXFGAF8DJJ8NPfgLz58OPfxy7GhEpFgrwAvn2t8MV+I03wqJFsasRkWKgAC8QszCl2adP6Idv3Rq7IhFJOgV4AXXqFPrh27eHic2PP45dkYgkmQK8wAYMgDvugLlzYcKE2NWISJIpwCO4/PKwg88NN4SFr0REWkIBHoFZ2MXnhBNCkG/bFrsiEUkiBXgknTuHfvi774ZpTfXDRaS5FOARVVTA1Knw+9/DxImxqxGRpFGAR3bllXDRRfDDH4aNkUVEstVkgJtZmZktNLPVZvaKmY3JHD/KzOaZ2ZrM1y75L7f4mME990B5eeiHv/NO7IpEJCmyuQLfC4xz937AYOA7ZtYfuBaY7+59gPmZ59IChx8e+uFbt8KoUeqHi0h2mgxwd9/k7ssy3/8FWA30BM4DZmReNgM4P19FpsHnPge33w5z5sDkybGrEZEkaFYP3MzKgYHAEqC7u2+CEPLA0bkuLm2uugq+9jW47jp47rnY1YhIa5d1gJtZJ+C3wFh3/6AZvzfazKrNrHqbbng+IDO4917o3RtGjAi3GIqINCarADezdoTwftDdZ2UObzGzHpmf9wAaXJ7J3ae7e6W7V5aWluai5qJ2xBEwc2bY/OFb3wL32BWJSGuVzV0oBtwHrHb32+v86HFgVOb7UcDs3JeXTpWVMGkSPPEETJkSuxoRaa2yuQIfAlwCnGlmyzOP4cAEoMrM1gBVmeeSI//8z3DBBXDNNbBkSexqRKQ1Mi/g/6NXVlZ6dXV1wd4v6bZvD3enuMNLL0EX3WkvkkpmttTdK/c/rknMVqxLl9AP37hR/XAR+SQFeCs3aBDceivMnh3WERcRqaEAT4CxY+Hcc+Hqq+HFF2NXIyKthQI8AczgF7+AHj3Cwlfvvx+7IhFpDRTgCXHUUaEfvn49XHaZ+uEiogBPlMGDwz6as2aFcft9+2JXJCIxKcAT5gc/CFfgt94KVVXhDhURSScFeMLUrJfyi1+EAZ+KCnj66dhViUgMCvCE+uY3wx0p3bvD2WeHlspHH8WuSkQKSQGeYP37h6vwK64IvfHTT4e33opdlYgUigI84Tp0gOnT4aGHYMWK0FJ5/PHYVYlIISjAi8TIkbBsWdhb87zzwoede/bErkpE8kkBXkT69IHnnw8rGU6ZAkOGwP/+b+yqRCRfFOBF5tBDw5ops2bB66/DwIHw6KOxqxKRfFCAF6kLLghL0PbrF8bvr7oKdu2KXZWI5JICvIiVl8Mzz4RFsO66C774Rfjzn2NXJSK5ogAvcu3awcSJ8NRTsGEDfP7z8MADsasSkVxQgKfE8OGwfHnY4efSS8MGER9+GLsqETkYCvAU6dULFiyAG26AGTPgC1+AVatiVyUiLaUAT5m2beHmm2HePHjvvRDi996r5WlFkkgBnlJnnQUvvwxf/nIYxb/4Yvjgg9hViUhzKMBTrHv3sJLhLbeEzSI+//kwzSkiyaAAT7k2beD662HRIti5E770JbjzTrVURJJAAS4AnHpquEulqiqM4v/DP8D27bGrEpEDUYDL/+vWLaxkOGkSPPFEGMN/4YXYVYlIYxTgUk+bNjBuHDz7bNj959RTQ6B//HHsykRkfwpwadAXvxjWUjn33DCK/9WvwjvvxK5KROpSgEujjjwSfvMb+OlP4Q9/gFNOgcWLY1clIjUU4HJAZmElwxdegI4d4Ywz4D/+A/bti12ZiCjAJSsDB8LSpTBiRBjF/9u/hc2bY1clkm4KcMla587wX/8F990Hzz0XWipTpsC2bbErE0knBbg0ixl8+9vw4ovw6U+HvTePOSbcN/7UU7B3b+wKRdJDAS4tctJJ8Kc/hdUMx4wJG0eccw4ceyxcdx38z//ErlCk+CnA5aCcdFK4T3z9enjssbCeysSJ0LcvnHYa/PKXsGNH7CpFipMCXHKipATOPz9McL79NkyYAFu2hI0jevSAyy8PfXOtsSKSO00GuJndb2ZbzWxVnWM3mtkGM1ueeQzPb5mSJMccA9dcA6+9FiY6L7oIfv1rGDIkbLI8caLuYBHJhWyuwH8JnN3A8SnuXpF5zMltWVIMzEJo33dfCOz774fS0hDuvXqFKc/Zs+Gjj2JXKpJMTQa4uy8G3itALVLEOnUK7ZRnnglX5uPHhztZzj8/hPnVV8Orr8auUiRZDqYH/l0zW5FpsXRp7EVmNtrMqs2septuGBbCB5wTJoRe+RNPhKv0qVPDB6Jf+hLcc492BxLJRksD/C7gBKAC2ARMbuyF7j7d3SvdvbK0tLSFbyfFqG3bcOvhrFmwYQNMnhyCe/Ro+NSnYNSosPaKPvgUaViLAtzdt7j7Pnf/GLgHGJTbsiRtjj46DAWtWgVLlsCll8Lvfgdf+Qp85jPw4x+HkBeRWi0KcDPrUefpBcCqxl4r0hxmMGgQ/PznsGkT/OpXoUf+r/8ahoSGDw8rJO7eHbtSkfiyuY3wYeB5oK+ZrTezy4CJZrbSzFYAZwDfz3OdkkIdOsAll8DChfD662HvzpUr4cILoWdP+P73w3ORtDIvYIOxsrLSq6urC/Z+Unz27Qtrk99/f2ix7NkDlZVhfZaRI8Ma5iLFxsyWunvl/sc1iSmJcsghYSnbmTNh40aYNi2E+FVXhYnPiy+G+fO1BZykgwJcEqtrV/je92D58rBW+WWXwZw5MHQoHH883HQTvPlm7CpF8kcBLolnBp/7HNx5Z/jg8+GHw50rN90Exx0Hw4aFUf5du2JXKpJbCnApKu3bh12D5s6FtWvhxhthzZrQH+/RA777XVi2TPeWS3FQgEvR6t0bfvQjeOON8MHn8OFw771hyduBA+GOO+Ddd2NXKdJyCnApem3awFlnwYMPhhbLz34G7dqFjSiOOSaslvj009qoWZJHAS6p0qUL/NM/hYW0Xn453L2yYAGcfTaUl4cNm994I3aVItlRgEtqDRgQNmXesCFMd372s2Fk/9OfhjPOgAcegL/+NXaVIo1TgEvqHXpo2JR5zpxw2+Ett4SVEi+9NHzweeWVYX0WffAprY0CXKSOXr3CyP6aNfDHP4b1yh94AAYPDlfot98OW7fGrlIkUICLNMAsbMo8Y0bYTWj6dOjcGcaNC+uw/P3fw5NPwt69sSuVNFOAizTh8MPhiivg+efhlVdg7Fj405/gq1+FsjK49lr4859jVylppAAXaYb+/eG222D9+rCY1qBBMGkSnHhiaLPcfDO88IKuzKUwtBqhyEHavDmsW/7oo2FNFvewKuJZZ4Ux/mHDwi2KIi3V2GqECnCRHHr33bAa4ty5YTho/fpwvE+f2jA//fTQlhHJlgJcpMDcQ2987tzwWLgw3Ffetm1ot9QEemVlWCZXpDEKcJHIdu8OH4TWBHrNolpdutRvt/TuHbtSaW0U4CKtzDvv1LZb5s6tbbd85jP12y2dO0ctU1oBBbhIK+YOr71WG+aLFtW2W/7mb0KYV1WFlRTVbkkfBbhIguzeDc89V7/dAqHdMnRo7RX6scfGrVMKQwEukmDbtoU1zWsCfePGcLxv3/rtlk6dopYpeaIAFykS7rB6df12y86dYY3zmnbLsGFh0wq1W4qDAlykSO3aVb/d8tJL4XjXrrXtlqqqMPYvyaQAF0mJrVvrt1s2bQrH+/ULQT5sGHzlK2q3JIkCXCSF3MMCXDVh/sc/hiv2du1gyJD67ZY2Whmp1VKAiwi7dsGzz8K8eSHQly8Px7t2rb06r6oK66JL66EAF5FP2LKlfrtl8+ZwvH//2qvz006Djh3j1pl2CnAROSB3WLWqNswXLw5X7CUl9dstFRVqtxSaAlxEmmXnztBuqQn0FSvC8W7d6rdbevaMW2caKMBF5KBs3ly/3bJlSzh+0kn12y0dOsStsxgpwEUkZ9xh5cr67Zbdu0O75dRTw5X5F74QNrIoKwt3vUjLKcBFJG927oRnnqkN9JUra3/Wpk24q6W8vOFHWVlYtEsa11iA67SJyEE77LDaNgqEdsvq1bBuXXisXRu+LlwYls2te914yCGNB/xxx4UeuwK+YTotIpJzn/pUeDRkz54Q4vuH+7p1YX30DRs+GfBlZQ2He3l5CPi0rvmiABeRgiopgeOPD4+G7NkDb7/9yXBfty4MIG3cWD/g27b9ZMDXhHt5ORxzTPEGfJMBbmb3A+cAW9395Myxo4CZQDmwDrjI3bfnr0wRSYuSEjjhhPBoyO7djQf800/XLrVbo23bsG56Q+FeXg49eiQ34Jv8ENPMTgN2AL+qE+ATgffcfYKZXQt0cfdrmnozfYgpIvm2a1cI+P3DveZRs7hXjXbtmg742INLLf4Q090Xm1n5fofPA07PfD8DWAQ0GeAiIvnWvj306RMeDdm1C958s+Fwf+qp2uUEapSU1Ab8/uFeXh56/bECvqU98O7uvgnA3TeZ2dGNvdDMRgOjAY7V/k8iEln79mEno759G/75zp0NB/zatTB7dliut66SEujdu/GA7949fwGf9w8x3X06MB1CCyXf7ycicjAOOwxOPDE8GvLXv34y4GvaNY89Fra/q+vQQ0PA33132PYul1oa4FvMrEfm6rsHsLXJ3xARKQIdOoTNMfr1a/jnH37YcMB365b7Wloa4I8Do4AJma+zc1aRiEiCdewYluPt3z//79VkZ8bMHgaeB/qa2Xozu4wQ3FVmtgaoyjwXEZECyuYulJGN/OisHNciIiLNoGXZRUQSSgEuIpJQCnARkYRSgIuIJJQCXEQkoRTgIiIJVdAt1cxsG/Bmwd4wP7oB78QuohXR+ailc1Gfzkd9B3M+ert76f4HCxrgxcDMqhta1jGtdD5q6VzUp/NRXz7Oh1ooIiIJpQAXEUkoBXjzTY9dQCuj81FL56I+nY/6cn4+1AMXEUkoXYGLiCSUAlxEJKEU4AdgZveb2VYzW1Xn2FFmNs/M1mS+dolZY6GYWZmZLTSz1Wb2ipmNyRxP6/lob2b/bWYvZ87HTZnjx5nZksz5mGlmJbFrLRQzO8TMXjKzJzPP03wu1pnZSjNbbmbVmWM5/1tRgB/YL4Gz9zt2LTDf3fsA8zPP02AvMM7d+wGDge+YWX/Sez52A2e6+ylABXC2mQ0GbgWmZM7HduCyiDUW2hhgdZ3naT4XAGe4e0Wde79z/reiAD8Ad18MvLff4fOAGZnvZwDnF7SoSNx9k7svy3z/F8Ifak/Sez7c3XdknrbLPBw4E/hN5nhqzoeZ9QL+Drg389xI6bk4gJz/rSjAm6+7u2+CEGrA0ZHrKTgzKwcGAktI8fnItAyWEzb1nge8Abzv7nszL1lP+EcuDaYC/wJ8nHnelfSeCwj/mM81s6VmNjpzLOd/Ky3d1FhSysw6Ab8Fxrr7B+FCK53cfR9QYWZHAo8BDe1TXvT36ZrZOcBWd19qZqfXHG7gpUV/LuoY4u4bzexoYJ6ZvZaPN9EVePNtMbMeAJmvWyPXUzBm1o4Q3g+6+6zM4dSejxru/j6wiPDZwJFmVnNh1AvYGKuuAhoCnGtm64BfE1onU0nnuQDA3Tdmvm4l/OM+iDz8rSjAm+9xYFTm+1HA7Ii1FEymp3kfsNrdb6/zo7Sej9LMlTdmdhgwlPC5wELga5mXpeJ8uPt17t7L3cuBEcACd7+YFJ4LADPraGada74HhgGryMPfiiYxD8DMHgZOJywDuQX4N+B3wCPAscBbwIXuvv8HnUXHzL4MPAOspLbPeT2hD57G8zGA8EHUIYQLoUfc/WYzO55wFXoU8BLwDXffHa/Swsq0UMa7+zlpPReZ/+7HMk/bAg+5+y1m1pUc/60owEVEEkotFBGRhFKAi4gklAJcRCShFOAiIgmlABcRSSgFuIhIQinARUQS6v8Av4LM8nm7hycAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制不同K对应的聚类的性能，找到最佳模型／参数（分数最高）\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks, np.array(CH_scores), 'b-',label = 'CH_scores')\n",
    "\n",
    "\n",
    "### 最佳超参数\n",
    "index = np.unravel_index(np.argmax(CH_scores, axis=None), len(CH_scores))\n",
    "Best_K = Ks[index[0]]\n",
    "\n",
    "print(Best_K)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "mb_kmeans = KMeans(n_clusters = Best_K)\n",
    "\n",
    "y_pred = mb_kmeans.fit_predict(weight)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 3, 4, ..., 3, 3, 4])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\envs\\tf1.14\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function calinski_harabaz_score is deprecated; Function 'calinski_harabaz_score' has been renamed to 'calinski_harabasz_score' and will be removed in version 0.23.\n",
      "  warnings.warn(msg, category=FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CH_score: 32.59592869960693\n"
     ]
    }
   ],
   "source": [
    "CH_score = metrics.calinski_harabaz_score(weight, y_pred)\n",
    "print(\"CH_score: {}\".format(CH_score))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存聚类结果\n",
    "feat_names_Kmeans = \"Kmeans_\" + str(Best_K)\n",
    "train_id = train['企业类别标签']\n",
    "train_kmeans = pd.concat([train_id, pd.Series(name = feat_names_Kmeans, data = y_pred)], axis = 1)\n",
    "train_kmeans.to_csv('./tfidf_train_KMeans.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "CH_score随着cluster的增大而减小，最佳的聚类数目是5"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
